{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ba8399d4",
   "metadata": {},
   "source": [
    "## 显存优化策略"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfcc1424",
   "metadata": {},
   "source": [
    "### 正常加载：6G显存跑不起来"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc649c81",
   "metadata": {},
   "source": [
    "运行chinese-macbert-large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2e18d898",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d5d7cd9bb68c475f848f470c5936b87e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/6988 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d9edd840cff3427e912a65ab122f69e1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/777 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 第一步到第五步\n",
    "from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments\n",
    "from datasets import load_dataset\n",
    "dataset=load_dataset(\"csv\",data_files=\"../tst_fold\\ChnSentiCorp_htl_all copy.csv\",split=\"train\")\n",
    "dataset=dataset.filter(lambda x: x[\"review\"] is not None)\n",
    "datasets=dataset.train_test_split(test_size=0.1)\n",
    "import torch\n",
    "tokenizer=AutoTokenizer.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")\n",
    "def process_func(examples):\n",
    "    tokenized_examples=tokenizer(examples[\"review\"],max_length=128,truncation=True,padding=\"max_length\")\n",
    "    tokenized_examples[\"labels\"]=examples[\"label\"]\n",
    "    return tokenized_examples\n",
    "tokenized_datasets=datasets.map(process_func,batched=True,remove_columns=datasets[\"train\"].column_names)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "85cfd9ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "\"\"\"评估函数\"\"\"\n",
    "acc_metrics=evaluate.load(r\"../evaluate_main\\metrics\\accuracy\")\n",
    "f1_metrics=evaluate.load(r\"../evaluate_main\\metrics\\f1\")\n",
    "def eval_metric(eval_predict):\n",
    "    predictions,labels=eval_predict\n",
    "    predictions=predictions.argmax(axis=-1)\n",
    "    acc=acc_metrics.compute(predictions=predictions,references=labels)\n",
    "    f1=f1_metrics.compute(predictions=predictions,references=labels)\n",
    "    acc.update(f1)\n",
    "    return acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4cf3bddc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "\"\"\"创建模型\"\"\"\n",
    "model=AutoModelForSequenceClassification.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")#模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "93b6dad4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TrainingArguments(\n",
       "_n_gpu=1,\n",
       "adafactor=False,\n",
       "adam_beta1=0.9,\n",
       "adam_beta2=0.999,\n",
       "adam_epsilon=1e-08,\n",
       "auto_find_batch_size=False,\n",
       "bf16=False,\n",
       "bf16_full_eval=False,\n",
       "data_seed=None,\n",
       "dataloader_drop_last=False,\n",
       "dataloader_num_workers=0,\n",
       "dataloader_pin_memory=True,\n",
       "ddp_bucket_cap_mb=None,\n",
       "ddp_find_unused_parameters=None,\n",
       "ddp_timeout=1800,\n",
       "debug=[],\n",
       "deepspeed=None,\n",
       "disable_tqdm=False,\n",
       "do_eval=True,\n",
       "do_predict=False,\n",
       "do_train=False,\n",
       "eval_accumulation_steps=None,\n",
       "eval_delay=0,\n",
       "eval_steps=None,\n",
       "evaluation_strategy=epoch,\n",
       "fp16=False,\n",
       "fp16_backend=auto,\n",
       "fp16_full_eval=False,\n",
       "fp16_opt_level=O1,\n",
       "fsdp=[],\n",
       "fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},\n",
       "fsdp_min_num_params=0,\n",
       "fsdp_transformer_layer_cls_to_wrap=None,\n",
       "full_determinism=False,\n",
       "gradient_accumulation_steps=1,\n",
       "gradient_checkpointing=False,\n",
       "greater_is_better=True,\n",
       "group_by_length=False,\n",
       "half_precision_backend=auto,\n",
       "hub_model_id=None,\n",
       "hub_private_repo=False,\n",
       "hub_strategy=every_save,\n",
       "hub_token=<HUB_TOKEN>,\n",
       "ignore_data_skip=False,\n",
       "include_inputs_for_metrics=False,\n",
       "jit_mode_eval=False,\n",
       "label_names=None,\n",
       "label_smoothing_factor=0.0,\n",
       "learning_rate=2e-05,\n",
       "length_column_name=length,\n",
       "load_best_model_at_end=True,\n",
       "local_rank=-1,\n",
       "log_level=passive,\n",
       "log_level_replica=warning,\n",
       "log_on_each_node=True,\n",
       "logging_dir=./checkpoints_Trainer\\runs\\May19_15-05-10_DESKTOP-22GG4OF,\n",
       "logging_first_step=False,\n",
       "logging_nan_inf_filter=True,\n",
       "logging_steps=10,\n",
       "logging_strategy=steps,\n",
       "lr_scheduler_type=linear,\n",
       "max_grad_norm=1.0,\n",
       "max_steps=-1,\n",
       "metric_for_best_model=f1,\n",
       "mp_parameters=,\n",
       "no_cuda=False,\n",
       "num_train_epochs=1,\n",
       "optim=adamw_hf,\n",
       "optim_args=None,\n",
       "output_dir=./checkpoints_Trainer,\n",
       "overwrite_output_dir=False,\n",
       "past_index=-1,\n",
       "per_device_eval_batch_size=32,\n",
       "per_device_train_batch_size=32,\n",
       "prediction_loss_only=False,\n",
       "push_to_hub=False,\n",
       "push_to_hub_model_id=None,\n",
       "push_to_hub_organization=None,\n",
       "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
       "ray_scope=last,\n",
       "remove_unused_columns=True,\n",
       "report_to=['tensorboard'],\n",
       "resume_from_checkpoint=None,\n",
       "run_name=./checkpoints_Trainer,\n",
       "save_on_each_node=False,\n",
       "save_steps=500,\n",
       "save_strategy=epoch,\n",
       "save_total_limit=3,\n",
       "seed=42,\n",
       "sharded_ddp=[],\n",
       "skip_memory_metrics=True,\n",
       "tf32=None,\n",
       "torch_compile=False,\n",
       "torch_compile_backend=None,\n",
       "torch_compile_mode=None,\n",
       "torchdynamo=None,\n",
       "tpu_metrics_debug=False,\n",
       "tpu_num_cores=None,\n",
       "use_ipex=False,\n",
       "use_legacy_prediction_loop=False,\n",
       "use_mps_device=False,\n",
       "warmup_ratio=0.0,\n",
       "warmup_steps=0,\n",
       "weight_decay=0.01,\n",
       "xpu_backend=None,\n",
       ")"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_args=TrainingArguments(output_dir=\"./checkpoints_Trainer\",\n",
    "                             per_device_train_batch_size=32,#训练集 dataloader每个批次大小\n",
    "                             per_device_eval_batch_size=32,#测试集 dataloader 每个批次大小\n",
    "                             num_train_epochs=1, #总轮数\n",
    "                             logging_steps=10, # 每十步进行一次打印\n",
    "                             #epoch:每一轮做一次评估\n",
    "                             #steps，需要搭配eval_steps=xxx使用:没xxx步评估一次\n",
    "                             evaluation_strategy=\"epoch\",\n",
    "                             #保存策略\n",
    "                             save_strategy=\"epoch\",#每一轮保存一下模型\n",
    "                             save_total_limit=3,#限制最多保存5个模型\n",
    "                             learning_rate=2e-5,\n",
    "                             weight_decay=0.01,#梯度衰减\n",
    "                             metric_for_best_model=\"f1\",#设置评估指标\n",
    "                             load_best_model_at_end=True#最后默认加载最好的 模型 \n",
    "                            #  optim= 设置优化器\n",
    "                             )\n",
    "train_args#查看参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "13d139f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DataCollatorWithPadding\n",
    "\"\"\"传入模型，训练参数，数据集\"\"\"\n",
    "trainer=Trainer(model=model,args=train_args,train_dataset=tokenized_datasets[\"train\"],\n",
    "                eval_dataset=tokenized_datasets[\"test\"],\n",
    "                data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
    "                compute_metrics=eval_metric \n",
    "                )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a1d655e3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "11bc29c6cc8a4ff395433b7061b853da",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/219 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "OutOfMemoryError",
     "evalue": "CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 6.00 GiB total capacity; 5.31 GiB already allocated; 0 bytes free; 5.32 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[14], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;66;03m#报错，超出显存\u001b[39;00m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1633\u001b[0m, in \u001b[0;36mTrainer.train\u001b[1;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[0;32m   1628\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[0;32m   1630\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[0;32m   1631\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[0;32m   1632\u001b[0m )\n\u001b[1;32m-> 1633\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1634\u001b[0m \u001b[43m    \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1635\u001b[0m \u001b[43m    \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1636\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1637\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1638\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1902\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[1;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[0;32m   1900\u001b[0m         tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_step(model, inputs)\n\u001b[0;32m   1901\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1902\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1904\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m   1905\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[0;32m   1906\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[0;32m   1907\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[0;32m   1908\u001b[0m ):\n\u001b[0;32m   1909\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[0;32m   1910\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:2645\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[1;34m(self, model, inputs)\u001b[0m\n\u001b[0;32m   2642\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[0;32m   2644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[1;32m-> 2645\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2647\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m   2648\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean()  \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[1;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[0;32m   2675\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m   2676\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 2677\u001b[0m outputs \u001b[38;5;241m=\u001b[39m model(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs)\n\u001b[0;32m   2678\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[0;32m   2679\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[0;32m   2680\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:1562\u001b[0m, in \u001b[0;36mBertForSequenceClassification.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m   1554\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   1555\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[0;32m   1556\u001b[0m \u001b[38;5;124;03m    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\u001b[39;00m\n\u001b[0;32m   1557\u001b[0m \u001b[38;5;124;03m    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\u001b[39;00m\n\u001b[0;32m   1558\u001b[0m \u001b[38;5;124;03m    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\u001b[39;00m\n\u001b[0;32m   1559\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   1560\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m-> 1562\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbert\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1563\u001b[0m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1564\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1565\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1566\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1567\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1568\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1569\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1570\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1571\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1572\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1574\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m   1576\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(pooled_output)\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:1013\u001b[0m, in \u001b[0;36mBertModel.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m   1006\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[0;32m   1007\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[0;32m   1008\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[0;32m   1009\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[0;32m   1010\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[0;32m   1011\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[1;32m-> 1013\u001b[0m embedding_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membeddings\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1014\u001b[0m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1015\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1016\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1017\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1018\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpast_key_values_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1019\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1020\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder(\n\u001b[0;32m   1021\u001b[0m     embedding_output,\n\u001b[0;32m   1022\u001b[0m     attention_mask\u001b[38;5;241m=\u001b[39mextended_attention_mask,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1030\u001b[0m     return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[0;32m   1031\u001b[0m )\n\u001b[0;32m   1032\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:231\u001b[0m, in \u001b[0;36mBertEmbeddings.forward\u001b[1;34m(self, input_ids, token_type_ids, position_ids, inputs_embeds, past_key_values_length)\u001b[0m\n\u001b[0;32m    229\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inputs_embeds \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m    230\u001b[0m     inputs_embeds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mword_embeddings(input_ids)\n\u001b[1;32m--> 231\u001b[0m token_type_embeddings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoken_type_embeddings\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    233\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m inputs_embeds \u001b[38;5;241m+\u001b[39m token_type_embeddings\n\u001b[0;32m    234\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mposition_embedding_type \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mabsolute\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\sparse.py:160\u001b[0m, in \u001b[0;36mEmbedding.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m    159\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[1;32m--> 160\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    161\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    162\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnorm_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\functional.py:2210\u001b[0m, in \u001b[0;36membedding\u001b[1;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[0;32m   2204\u001b[0m     \u001b[38;5;66;03m# Note [embedding_renorm set_grad_enabled]\u001b[39;00m\n\u001b[0;32m   2205\u001b[0m     \u001b[38;5;66;03m# XXX: equivalent to\u001b[39;00m\n\u001b[0;32m   2206\u001b[0m     \u001b[38;5;66;03m# with torch.no_grad():\u001b[39;00m\n\u001b[0;32m   2207\u001b[0m     \u001b[38;5;66;03m#   torch.embedding_renorm_\u001b[39;00m\n\u001b[0;32m   2208\u001b[0m     \u001b[38;5;66;03m# remove once script supports set_grad_enabled\u001b[39;00m\n\u001b[0;32m   2209\u001b[0m     _no_grad_embedding_renorm_(weight, \u001b[38;5;28minput\u001b[39m, max_norm, norm_type)\n\u001b[1;32m-> 2210\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[1;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 6.00 GiB total capacity; 5.31 GiB already allocated; 0 bytes free; 5.32 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
     ]
    }
   ],
   "source": [
    "trainer.train()#报错，超出显存"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd3123dd",
   "metadata": {},
   "source": [
    "### 设置梯度累加(Gradient Accumulation)，与train,val的batch_size"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21fcbef4",
   "metadata": {},
   "source": [
    "per_device_train_batch_size=1,<br>\n",
    "gradient_accumulation_steps=32,<br>\n",
    "per_device_eval_batch_size=1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b411c65",
   "metadata": {},
   "source": [
    "通过梯度累积，每32个小批次（每个批次大小为1）的梯度累加后，才更新一次模型参数。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2c0a89c6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d0239b4f0546433ba973ff151a6037f8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/6988 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "894003cb7ccb46e38a1c55d17294dcb2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/777 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "z:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4abf67c682414f38960918d9340ab0e5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/218 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'loss': 0.6231, 'learning_rate': 1.9082568807339454e-05, 'epoch': 0.05}\n",
      "{'loss': 0.4527, 'learning_rate': 1.81651376146789e-05, 'epoch': 0.09}\n",
      "{'loss': 0.2721, 'learning_rate': 1.724770642201835e-05, 'epoch': 0.14}\n",
      "{'loss': 0.3404, 'learning_rate': 1.63302752293578e-05, 'epoch': 0.18}\n",
      "{'loss': 0.323, 'learning_rate': 1.541284403669725e-05, 'epoch': 0.23}\n",
      "{'loss': 0.2156, 'learning_rate': 1.4495412844036698e-05, 'epoch': 0.27}\n",
      "{'loss': 0.2274, 'learning_rate': 1.3577981651376149e-05, 'epoch': 0.32}\n",
      "{'loss': 0.2723, 'learning_rate': 1.2660550458715597e-05, 'epoch': 0.37}\n",
      "{'loss': 0.2186, 'learning_rate': 1.1743119266055047e-05, 'epoch': 0.41}\n",
      "{'loss': 0.2405, 'learning_rate': 1.0825688073394496e-05, 'epoch': 0.46}\n",
      "{'loss': 0.19, 'learning_rate': 9.908256880733946e-06, 'epoch': 0.5}\n",
      "{'loss': 0.2471, 'learning_rate': 8.990825688073395e-06, 'epoch': 0.55}\n",
      "{'loss': 0.23, 'learning_rate': 8.073394495412845e-06, 'epoch': 0.6}\n",
      "{'loss': 0.2051, 'learning_rate': 7.155963302752295e-06, 'epoch': 0.64}\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[1], line 56\u001b[0m\n\u001b[0;32m     50\u001b[0m \u001b[38;5;124;03m\"\"\"传入模型，训练参数，数据集\"\"\"\u001b[39;00m\n\u001b[0;32m     51\u001b[0m trainer\u001b[38;5;241m=\u001b[39mTrainer(model\u001b[38;5;241m=\u001b[39mmodel,args\u001b[38;5;241m=\u001b[39mtrain_args,train_dataset\u001b[38;5;241m=\u001b[39mtokenized_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m     52\u001b[0m                 eval_dataset\u001b[38;5;241m=\u001b[39mtokenized_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m     53\u001b[0m                 data_collator\u001b[38;5;241m=\u001b[39mDataCollatorWithPadding(tokenizer\u001b[38;5;241m=\u001b[39mtokenizer),\n\u001b[0;32m     54\u001b[0m                 compute_metrics\u001b[38;5;241m=\u001b[39meval_metric \n\u001b[0;32m     55\u001b[0m                 )\n\u001b[1;32m---> 56\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1633\u001b[0m, in \u001b[0;36mTrainer.train\u001b[1;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[0;32m   1628\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[0;32m   1630\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[0;32m   1631\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[0;32m   1632\u001b[0m )\n\u001b[1;32m-> 1633\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1634\u001b[0m \u001b[43m    \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1635\u001b[0m \u001b[43m    \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1636\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1637\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1638\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1902\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[1;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[0;32m   1900\u001b[0m         tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_step(model, inputs)\n\u001b[0;32m   1901\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1902\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1904\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m   1905\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[0;32m   1906\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[0;32m   1907\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[0;32m   1908\u001b[0m ):\n\u001b[0;32m   1909\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[0;32m   1910\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:2645\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[1;34m(self, model, inputs)\u001b[0m\n\u001b[0;32m   2642\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[0;32m   2644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[1;32m-> 2645\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2647\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m   2648\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean()  \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[1;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[0;32m   2675\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m   2676\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 2677\u001b[0m outputs \u001b[38;5;241m=\u001b[39m model(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs)\n\u001b[0;32m   2678\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[0;32m   2679\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[0;32m   2680\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:1562\u001b[0m, in \u001b[0;36mBertForSequenceClassification.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m   1554\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   1555\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[0;32m   1556\u001b[0m \u001b[38;5;124;03m    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\u001b[39;00m\n\u001b[0;32m   1557\u001b[0m \u001b[38;5;124;03m    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\u001b[39;00m\n\u001b[0;32m   1558\u001b[0m \u001b[38;5;124;03m    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\u001b[39;00m\n\u001b[0;32m   1559\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   1560\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m-> 1562\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbert\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1563\u001b[0m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1564\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1565\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1566\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1567\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1568\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1569\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1570\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1571\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1572\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1574\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m   1576\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(pooled_output)\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:993\u001b[0m, in \u001b[0;36mBertModel.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m    989\u001b[0m         token_type_ids \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mzeros(input_shape, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mlong, device\u001b[38;5;241m=\u001b[39mdevice)\n\u001b[0;32m    991\u001b[0m \u001b[38;5;66;03m# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]\u001b[39;00m\n\u001b[0;32m    992\u001b[0m \u001b[38;5;66;03m# ourselves in which case we just need to make it broadcastable to all heads.\u001b[39;00m\n\u001b[1;32m--> 993\u001b[0m extended_attention_mask: torch\u001b[38;5;241m.\u001b[39mTensor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_extended_attention_mask\u001b[49m\u001b[43m(\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minput_shape\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    995\u001b[0m \u001b[38;5;66;03m# If a 2D or 3D attention mask is provided for the cross-attention\u001b[39;00m\n\u001b[0;32m    996\u001b[0m \u001b[38;5;66;03m# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]\u001b[39;00m\n\u001b[0;32m    997\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mis_decoder \u001b[38;5;129;01mand\u001b[39;00m encoder_hidden_states \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\modeling_utils.py:858\u001b[0m, in \u001b[0;36mModuleUtilsMixin.get_extended_attention_mask\u001b[1;34m(self, attention_mask, input_shape, device, dtype)\u001b[0m\n\u001b[0;32m    852\u001b[0m \u001b[38;5;66;03m# Since attention_mask is 1.0 for positions we want to attend and 0.0 for\u001b[39;00m\n\u001b[0;32m    853\u001b[0m \u001b[38;5;66;03m# masked positions, this operation will create a tensor which is 0.0 for\u001b[39;00m\n\u001b[0;32m    854\u001b[0m \u001b[38;5;66;03m# positions we want to attend and the dtype's smallest value for masked positions.\u001b[39;00m\n\u001b[0;32m    855\u001b[0m \u001b[38;5;66;03m# Since we are adding it to the raw scores before the softmax, this is\u001b[39;00m\n\u001b[0;32m    856\u001b[0m \u001b[38;5;66;03m# effectively the same as removing these entirely.\u001b[39;00m\n\u001b[0;32m    857\u001b[0m extended_attention_mask \u001b[38;5;241m=\u001b[39m extended_attention_mask\u001b[38;5;241m.\u001b[39mto(dtype\u001b[38;5;241m=\u001b[39mdtype)  \u001b[38;5;66;03m# fp16 compatibility\u001b[39;00m\n\u001b[1;32m--> 858\u001b[0m extended_attention_mask \u001b[38;5;241m=\u001b[39m (\u001b[38;5;241;43m1.0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mextended_attention_mask\u001b[49m) \u001b[38;5;241m*\u001b[39m torch\u001b[38;5;241m.\u001b[39mfinfo(dtype)\u001b[38;5;241m.\u001b[39mmin\n\u001b[0;32m    859\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m extended_attention_mask\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\_tensor.py:39\u001b[0m, in \u001b[0;36m_handle_torch_function_and_wrap_type_error_to_not_implemented.<locals>.wrapped\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m     37\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m has_torch_function(args):\n\u001b[0;32m     38\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(wrapped, args, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m---> 39\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m f(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m     40\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m     41\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mNotImplemented\u001b[39m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\_tensor.py:834\u001b[0m, in \u001b[0;36mTensor.__rsub__\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m    832\u001b[0m \u001b[38;5;129m@_handle_torch_function_and_wrap_type_error_to_not_implemented\u001b[39m\n\u001b[0;32m    833\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__rsub__\u001b[39m(\u001b[38;5;28mself\u001b[39m, other):\n\u001b[1;32m--> 834\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_VariableFunctions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrsub\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mother\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# 第一步到第五步\n",
    "from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments\n",
    "from datasets import load_dataset\n",
    "dataset=load_dataset(\"csv\",data_files=\"../tst_fold\\ChnSentiCorp_htl_all copy.csv\",split=\"train\")\n",
    "dataset=dataset.filter(lambda x: x[\"review\"] is not None)\n",
    "datasets=dataset.train_test_split(test_size=0.1)\n",
    "import torch\n",
    "tokenizer=AutoTokenizer.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")\n",
    "def process_func(examples):\n",
    "    tokenized_examples=tokenizer(examples[\"review\"],max_length=128,truncation=True,padding=\"max_length\")\n",
    "    tokenized_examples[\"labels\"]=examples[\"label\"]\n",
    "    return tokenized_examples\n",
    "tokenized_datasets=datasets.map(process_func,batched=True,remove_columns=datasets[\"train\"].column_names)\n",
    "import evaluate\n",
    "\"\"\"评估函数\"\"\"\n",
    "acc_metrics=evaluate.load(r\"../evaluate_main\\metrics\\accuracy\")\n",
    "f1_metrics=evaluate.load(r\"../evaluate_main\\metrics\\f1\")\n",
    "def eval_metric(eval_predict):\n",
    "    predictions,labels=eval_predict\n",
    "    predictions=predictions.argmax(axis=-1)\n",
    "    acc=acc_metrics.compute(predictions=predictions,references=labels)\n",
    "    f1=f1_metrics.compute(predictions=predictions,references=labels)\n",
    "    acc.update(f1)\n",
    "    return acc\n",
    "\n",
    "\"\"\"创建模型\"\"\"\n",
    "model=AutoModelForSequenceClassification.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")#模型\n",
    "\n",
    "\n",
    "train_args=TrainingArguments(output_dir=\"./checkpoints_Trainer\",\n",
    "                             per_device_train_batch_size=1,#！！！！训练集 dataloader每个批次大小\n",
    "                             gradient_accumulation_steps=32,#！！！！设置梯度累加\n",
    "                             per_device_eval_batch_size=1,#！！！测试集 dataloader 每个批次大小\n",
    "                             num_train_epochs=1, #总轮数\n",
    "                             logging_steps=10, # 每十步进行一次打印\n",
    "                             #epoch:每一轮做一次评估\n",
    "                             #steps，需要搭配eval_steps=xxx使用:没xxx步评估一次\n",
    "                             evaluation_strategy=\"epoch\",\n",
    "                             #保存策略\n",
    "                             save_strategy=\"epoch\",#每一轮保存一下模型\n",
    "                             save_total_limit=3,#限制最多保存5个模型\n",
    "                             learning_rate=2e-5,\n",
    "                             weight_decay=0.01,#梯度衰减\n",
    "                             metric_for_best_model=\"f1\",#设置评估指标\n",
    "                             load_best_model_at_end=True#最后默认加载最好的 模型 \n",
    "                            #  optim= 设置优化器\n",
    "                             )\n",
    "train_args#查看参数\n",
    "from transformers import DataCollatorWithPadding\n",
    "\"\"\"传入模型，训练参数，数据集\"\"\"\n",
    "trainer=Trainer(model=model,args=train_args,train_dataset=tokenized_datasets[\"train\"],\n",
    "                eval_dataset=tokenized_datasets[\"test\"],\n",
    "                data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
    "                compute_metrics=eval_metric \n",
    "                )\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2375fe13",
   "metadata": {},
   "source": [
    "### gradiant checkpoints"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b3b1073",
   "metadata": {},
   "source": [
    "当设置 gradient_checkpointing=True 时，显存占用减少的核心原理是 通过牺牲部分计算时间，减少前向传播过程中中间激活值（activations）的存储需求"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "209730b6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fc75865bb17f41948b89fa004f156657",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/6988 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "098c5e37fd724e9e9256d96c4fed4e69",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/777 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "z:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "57b5020f39934d74b997219b1c49c6d3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/218 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'loss': 0.5954, 'learning_rate': 1.9082568807339454e-05, 'epoch': 0.05}\n",
      "{'loss': 0.3304, 'learning_rate': 1.81651376146789e-05, 'epoch': 0.09}\n",
      "{'loss': 0.3522, 'learning_rate': 1.724770642201835e-05, 'epoch': 0.14}\n",
      "{'loss': 0.3344, 'learning_rate': 1.63302752293578e-05, 'epoch': 0.18}\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[1], line 57\u001b[0m\n\u001b[0;32m     51\u001b[0m \u001b[38;5;124;03m\"\"\"传入模型，训练参数，数据集\"\"\"\u001b[39;00m\n\u001b[0;32m     52\u001b[0m trainer\u001b[38;5;241m=\u001b[39mTrainer(model\u001b[38;5;241m=\u001b[39mmodel,args\u001b[38;5;241m=\u001b[39mtrain_args,train_dataset\u001b[38;5;241m=\u001b[39mtokenized_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m     53\u001b[0m                 eval_dataset\u001b[38;5;241m=\u001b[39mtokenized_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m     54\u001b[0m                 data_collator\u001b[38;5;241m=\u001b[39mDataCollatorWithPadding(tokenizer\u001b[38;5;241m=\u001b[39mtokenizer),\n\u001b[0;32m     55\u001b[0m                 compute_metrics\u001b[38;5;241m=\u001b[39meval_metric \n\u001b[0;32m     56\u001b[0m                 )\n\u001b[1;32m---> 57\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1633\u001b[0m, in \u001b[0;36mTrainer.train\u001b[1;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[0;32m   1628\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[0;32m   1630\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[0;32m   1631\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[0;32m   1632\u001b[0m )\n\u001b[1;32m-> 1633\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1634\u001b[0m \u001b[43m    \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1635\u001b[0m \u001b[43m    \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1636\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1637\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1638\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1902\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[1;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[0;32m   1900\u001b[0m         tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_step(model, inputs)\n\u001b[0;32m   1901\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1902\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1904\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m   1905\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[0;32m   1906\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[0;32m   1907\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[0;32m   1908\u001b[0m ):\n\u001b[0;32m   1909\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[0;32m   1910\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:2663\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[1;34m(self, model, inputs)\u001b[0m\n\u001b[0;32m   2661\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdeepspeed\u001b[38;5;241m.\u001b[39mbackward(loss)\n\u001b[0;32m   2662\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 2663\u001b[0m     \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2665\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\u001b[38;5;241m.\u001b[39mdetach()\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\_tensor.py:488\u001b[0m, in \u001b[0;36mTensor.backward\u001b[1;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[0;32m    478\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m    479\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[0;32m    480\u001b[0m         Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[0;32m    481\u001b[0m         (\u001b[38;5;28mself\u001b[39m,),\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    486\u001b[0m         inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[0;32m    487\u001b[0m     )\n\u001b[1;32m--> 488\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    489\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[0;32m    490\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\autograd\\__init__.py:197\u001b[0m, in \u001b[0;36mbackward\u001b[1;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[0;32m    192\u001b[0m     retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[0;32m    194\u001b[0m \u001b[38;5;66;03m# The reason we repeat same the comment below is that\u001b[39;00m\n\u001b[0;32m    195\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[0;32m    196\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[1;32m--> 197\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[0;32m    198\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    199\u001b[0m \u001b[43m    \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# 第一步到第五步\n",
    "from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments\n",
    "from datasets import load_dataset\n",
    "dataset=load_dataset(\"csv\",data_files=\"../tst_fold\\ChnSentiCorp_htl_all copy.csv\",split=\"train\")\n",
    "dataset=dataset.filter(lambda x: x[\"review\"] is not None)\n",
    "datasets=dataset.train_test_split(test_size=0.1)\n",
    "import torch\n",
    "tokenizer=AutoTokenizer.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")\n",
    "def process_func(examples):\n",
    "    tokenized_examples=tokenizer(examples[\"review\"],max_length=128,truncation=True,padding=\"max_length\")\n",
    "    tokenized_examples[\"labels\"]=examples[\"label\"]\n",
    "    return tokenized_examples\n",
    "tokenized_datasets=datasets.map(process_func,batched=True,remove_columns=datasets[\"train\"].column_names)\n",
    "import evaluate\n",
    "\"\"\"评估函数\"\"\"\n",
    "acc_metrics=evaluate.load(r\"../evaluate_main\\metrics\\accuracy\")\n",
    "f1_metrics=evaluate.load(r\"../evaluate_main\\metrics\\f1\")\n",
    "def eval_metric(eval_predict):\n",
    "    predictions,labels=eval_predict\n",
    "    predictions=predictions.argmax(axis=-1)\n",
    "    acc=acc_metrics.compute(predictions=predictions,references=labels)\n",
    "    f1=f1_metrics.compute(predictions=predictions,references=labels)\n",
    "    acc.update(f1)\n",
    "    return acc\n",
    "\n",
    "\"\"\"创建模型\"\"\"\n",
    "model=AutoModelForSequenceClassification.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")#模型\n",
    "\n",
    "\n",
    "train_args=TrainingArguments(output_dir=\"./checkpoints_Trainer\",\n",
    "                             per_device_train_batch_size=1,#！！！！训练集 dataloader每个批次大小\n",
    "                             gradient_accumulation_steps=32,#！！！！设置梯度累加\n",
    "                             per_device_eval_batch_size=1,#！！！测试集 dataloader 每个批次大小\n",
    "                             gradient_checkpointing=True, #通过 以时间换空间 的策略，显著减少了激活值的显存占用\n",
    "                             num_train_epochs=1, #总轮数\n",
    "                             logging_steps=10, # 每十步进行一次打印\n",
    "                             #epoch:每一轮做一次评估\n",
    "                             #steps，需要搭配eval_steps=xxx使用:没xxx步评估一次\n",
    "                             evaluation_strategy=\"epoch\",\n",
    "                             #保存策略\n",
    "                             save_strategy=\"epoch\",#每一轮保存一下模型\n",
    "                             save_total_limit=3,#限制最多保存5个模型\n",
    "                             learning_rate=2e-5,\n",
    "                             weight_decay=0.01,#梯度衰减\n",
    "                             metric_for_best_model=\"f1\",#设置评估指标\n",
    "                             load_best_model_at_end=True#最后默认加载最好的 模型 \n",
    "                            #  optim= 设置优化器\n",
    "                             )\n",
    "train_args#查看参数\n",
    "from transformers import DataCollatorWithPadding\n",
    "\"\"\"传入模型，训练参数，数据集\"\"\"\n",
    "trainer=Trainer(model=model,args=train_args,train_dataset=tokenized_datasets[\"train\"],\n",
    "                eval_dataset=tokenized_datasets[\"test\"],\n",
    "                data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
    "                compute_metrics=eval_metric \n",
    "                )\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d71e652a",
   "metadata": {},
   "source": [
    "## Adafactor Optimizer"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e827f18c",
   "metadata": {},
   "source": [
    "设置 optim=\"adafactor\" 后显存占用减少的核心原因在于 Adafactor优化器通过简化优化器状态的计算和存储机制，显著降低了显存需求"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "204cf342",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "70a4b88a08ae45f1a13378ded544e200",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/6988 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "09f774de72e94d9db35f9bc3abcc0996",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/777 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5e0af05cf0c143dba5e0648805b44f6b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/218 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'loss': 0.6168, 'learning_rate': 1.9082568807339454e-05, 'epoch': 0.05}\n",
      "{'loss': 0.3748, 'learning_rate': 1.81651376146789e-05, 'epoch': 0.09}\n",
      "{'loss': 0.3708, 'learning_rate': 1.724770642201835e-05, 'epoch': 0.14}\n",
      "{'loss': 0.3471, 'learning_rate': 1.63302752293578e-05, 'epoch': 0.18}\n",
      "{'loss': 0.335, 'learning_rate': 1.541284403669725e-05, 'epoch': 0.23}\n",
      "{'loss': 0.2733, 'learning_rate': 1.4495412844036698e-05, 'epoch': 0.27}\n",
      "{'loss': 0.2979, 'learning_rate': 1.3577981651376149e-05, 'epoch': 0.32}\n",
      "{'loss': 0.3207, 'learning_rate': 1.2660550458715597e-05, 'epoch': 0.37}\n",
      "{'loss': 0.2361, 'learning_rate': 1.1743119266055047e-05, 'epoch': 0.41}\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[1], line 58\u001b[0m\n\u001b[0;32m     52\u001b[0m \u001b[38;5;124;03m\"\"\"传入模型，训练参数，数据集\"\"\"\u001b[39;00m\n\u001b[0;32m     53\u001b[0m trainer\u001b[38;5;241m=\u001b[39mTrainer(model\u001b[38;5;241m=\u001b[39mmodel,args\u001b[38;5;241m=\u001b[39mtrain_args,train_dataset\u001b[38;5;241m=\u001b[39mtokenized_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m     54\u001b[0m                 eval_dataset\u001b[38;5;241m=\u001b[39mtokenized_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m     55\u001b[0m                 data_collator\u001b[38;5;241m=\u001b[39mDataCollatorWithPadding(tokenizer\u001b[38;5;241m=\u001b[39mtokenizer),\n\u001b[0;32m     56\u001b[0m                 compute_metrics\u001b[38;5;241m=\u001b[39meval_metric \n\u001b[0;32m     57\u001b[0m                 )\n\u001b[1;32m---> 58\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1633\u001b[0m, in \u001b[0;36mTrainer.train\u001b[1;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[0;32m   1628\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[0;32m   1630\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[0;32m   1631\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[0;32m   1632\u001b[0m )\n\u001b[1;32m-> 1633\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1634\u001b[0m \u001b[43m    \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1635\u001b[0m \u001b[43m    \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1636\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1637\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1638\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1904\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[1;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[0;32m   1901\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m   1902\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_step(model, inputs)\n\u001b[1;32m-> 1904\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m   1905\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[0;32m   1906\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[0;32m   1907\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[0;32m   1908\u001b[0m ):\n\u001b[0;32m   1909\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[0;32m   1910\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n\u001b[0;32m   1911\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# 第一步到第五步\n",
    "from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments\n",
    "from datasets import load_dataset\n",
    "dataset=load_dataset(\"csv\",data_files=\"../tst_fold\\ChnSentiCorp_htl_all copy.csv\",split=\"train\")\n",
    "dataset=dataset.filter(lambda x: x[\"review\"] is not None)\n",
    "datasets=dataset.train_test_split(test_size=0.1)\n",
    "import torch\n",
    "tokenizer=AutoTokenizer.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")\n",
    "def process_func(examples):\n",
    "    tokenized_examples=tokenizer(examples[\"review\"],max_length=128,truncation=True,padding=\"max_length\")\n",
    "    tokenized_examples[\"labels\"]=examples[\"label\"]\n",
    "    return tokenized_examples\n",
    "tokenized_datasets=datasets.map(process_func,batched=True,remove_columns=datasets[\"train\"].column_names)\n",
    "import evaluate\n",
    "\"\"\"评估函数\"\"\"\n",
    "acc_metrics=evaluate.load(r\"../evaluate_main\\metrics\\accuracy\")\n",
    "f1_metrics=evaluate.load(r\"../evaluate_main\\metrics\\f1\")\n",
    "def eval_metric(eval_predict):\n",
    "    predictions,labels=eval_predict\n",
    "    predictions=predictions.argmax(axis=-1)\n",
    "    acc=acc_metrics.compute(predictions=predictions,references=labels)\n",
    "    f1=f1_metrics.compute(predictions=predictions,references=labels)\n",
    "    acc.update(f1)\n",
    "    return acc\n",
    "\n",
    "\"\"\"创建模型\"\"\"\n",
    "model=AutoModelForSequenceClassification.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")#模型\n",
    "\n",
    "\n",
    "train_args=TrainingArguments(output_dir=\"./checkpoints_Trainer\",\n",
    "                             per_device_train_batch_size=1,#！！！！训练集 dataloader每个批次大小\n",
    "                             gradient_accumulation_steps=32,#！！！！设置梯度累加\n",
    "                             per_device_eval_batch_size=1,#！！！测试集 dataloader 每个批次大小\n",
    "                             gradient_checkpointing=True, #! ! !通过 以时间换空间 的策略，显著减少了激活值的显存占用\n",
    "                             optim=\"adafactor\", #！！！切换adafactor\n",
    "                             num_train_epochs=1, #总轮数\n",
    "                             logging_steps=10, # 每十步进行一次打印\n",
    "                             #epoch:每一轮做一次评估\n",
    "                             #steps，需要搭配eval_steps=xxx使用:没xxx步评估一次\n",
    "                             evaluation_strategy=\"epoch\",\n",
    "                             #保存策略\n",
    "                             save_strategy=\"epoch\",#每一轮保存一下模型\n",
    "                             save_total_limit=3,#限制最多保存5个模型\n",
    "                             learning_rate=2e-5,\n",
    "                             weight_decay=0.01,#梯度衰减\n",
    "                             metric_for_best_model=\"f1\",#设置评估指标\n",
    "                             load_best_model_at_end=True#最后默认加载最好的 模型 \n",
    "                            #  optim= 设置优化器\n",
    "                             )\n",
    "train_args#查看参数\n",
    "from transformers import DataCollatorWithPadding\n",
    "\"\"\"传入模型，训练参数，数据集\"\"\"\n",
    "trainer=Trainer(model=model,args=train_args,train_dataset=tokenized_datasets[\"train\"],\n",
    "                eval_dataset=tokenized_datasets[\"test\"],\n",
    "                data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
    "                compute_metrics=eval_metric \n",
    "                )\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "627b1969",
   "metadata": {},
   "source": [
    "## Freeze Model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09d1918c",
   "metadata": {},
   "source": [
    "冻结BERT参数，训练全连接层参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "aa5261c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c1d08ccd4cff4e5b92734f73dbaedbd8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/6988 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d8fa5fb12c90449ebcdc34870cfd8445",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/777 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "53b82ed7c1664b3ea3837b7996b04c6b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/218 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
      "z:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\utils\\checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
      "  warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'loss': 0.6742, 'learning_rate': 1.9082568807339454e-05, 'epoch': 0.05}\n",
      "{'loss': 0.6594, 'learning_rate': 1.81651376146789e-05, 'epoch': 0.09}\n",
      "{'loss': 0.6703, 'learning_rate': 1.724770642201835e-05, 'epoch': 0.14}\n",
      "{'loss': 0.6452, 'learning_rate': 1.63302752293578e-05, 'epoch': 0.18}\n",
      "{'loss': 0.6306, 'learning_rate': 1.541284403669725e-05, 'epoch': 0.23}\n",
      "{'loss': 0.6343, 'learning_rate': 1.4495412844036698e-05, 'epoch': 0.27}\n",
      "{'loss': 0.6145, 'learning_rate': 1.3577981651376149e-05, 'epoch': 0.32}\n",
      "{'loss': 0.6193, 'learning_rate': 1.2660550458715597e-05, 'epoch': 0.37}\n",
      "{'loss': 0.6173, 'learning_rate': 1.1743119266055047e-05, 'epoch': 0.41}\n",
      "{'loss': 0.6303, 'learning_rate': 1.0825688073394496e-05, 'epoch': 0.46}\n",
      "{'loss': 0.5943, 'learning_rate': 9.908256880733946e-06, 'epoch': 0.5}\n",
      "{'loss': 0.6529, 'learning_rate': 8.990825688073395e-06, 'epoch': 0.55}\n",
      "{'loss': 0.6046, 'learning_rate': 8.073394495412845e-06, 'epoch': 0.6}\n",
      "{'loss': 0.6433, 'learning_rate': 7.155963302752295e-06, 'epoch': 0.64}\n",
      "{'loss': 0.6195, 'learning_rate': 6.238532110091744e-06, 'epoch': 0.69}\n",
      "{'loss': 0.6, 'learning_rate': 5.3211009174311936e-06, 'epoch': 0.73}\n",
      "{'loss': 0.6432, 'learning_rate': 4.403669724770643e-06, 'epoch': 0.78}\n",
      "{'loss': 0.6254, 'learning_rate': 3.486238532110092e-06, 'epoch': 0.82}\n",
      "{'loss': 0.6043, 'learning_rate': 2.5688073394495415e-06, 'epoch': 0.87}\n",
      "{'loss': 0.6443, 'learning_rate': 1.6513761467889911e-06, 'epoch': 0.92}\n",
      "{'loss': 0.6196, 'learning_rate': 7.339449541284405e-07, 'epoch': 0.96}\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "19674ba519244ed196173c309c5d9106",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/777 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'eval_loss': 0.6170468330383301, 'eval_accuracy': 0.6898326898326899, 'eval_f1': 0.8153256704980842, 'eval_runtime': 54.3558, 'eval_samples_per_second': 14.295, 'eval_steps_per_second': 14.295, 'epoch': 1.0}\n",
      "{'train_runtime': 529.553, 'train_samples_per_second': 13.196, 'train_steps_per_second': 0.412, 'train_loss': 0.6307064043272526, 'epoch': 1.0}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=218, training_loss=0.6307064043272526, metrics={'train_runtime': 529.553, 'train_samples_per_second': 13.196, 'train_steps_per_second': 0.412, 'train_loss': 0.6307064043272526, 'epoch': 1.0})"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第一步到第五步\n",
    "from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments\n",
    "from datasets import load_dataset\n",
    "dataset=load_dataset(\"csv\",data_files=\"../tst_fold\\ChnSentiCorp_htl_all copy.csv\",split=\"train\")\n",
    "dataset=dataset.filter(lambda x: x[\"review\"] is not None)\n",
    "datasets=dataset.train_test_split(test_size=0.1)\n",
    "import torch\n",
    "tokenizer=AutoTokenizer.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")\n",
    "def process_func(examples):\n",
    "    tokenized_examples=tokenizer(examples[\"review\"],max_length=128,truncation=True,padding=\"max_length\")\n",
    "    tokenized_examples[\"labels\"]=examples[\"label\"]\n",
    "    return tokenized_examples\n",
    "tokenized_datasets=datasets.map(process_func,batched=True,remove_columns=datasets[\"train\"].column_names)\n",
    "import evaluate\n",
    "\"\"\"评估函数\"\"\"\n",
    "acc_metrics=evaluate.load(r\"../evaluate_main\\metrics\\accuracy\")\n",
    "f1_metrics=evaluate.load(r\"../evaluate_main\\metrics\\f1\")\n",
    "def eval_metric(eval_predict):\n",
    "    predictions,labels=eval_predict\n",
    "    predictions=predictions.argmax(axis=-1)\n",
    "    acc=acc_metrics.compute(predictions=predictions,references=labels)\n",
    "    f1=f1_metrics.compute(predictions=predictions,references=labels)\n",
    "    acc.update(f1)\n",
    "    return acc\n",
    "\n",
    "\"\"\"创建模型\"\"\"\n",
    "model=AutoModelForSequenceClassification.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")#模型\n",
    "\n",
    "\n",
    "train_args=TrainingArguments(output_dir=\"./checkpoints_Trainer\",\n",
    "                             per_device_train_batch_size=1,#！！！！训练集 dataloader每个批次大小\n",
    "                             gradient_accumulation_steps=32,#！！！！设置梯度累加\n",
    "                             per_device_eval_batch_size=1,#！！！测试集 dataloader 每个批次大小\n",
    "                             gradient_checkpointing=True, #! ! !通过 以时间换空间 的策略，显著减少了激活值的显存占用\n",
    "                             optim=\"adafactor\", #！！！切换adafactor\n",
    "                             num_train_epochs=1, #总轮数\n",
    "                             logging_steps=10, # 每十步进行一次打印\n",
    "                             #epoch:每一轮做一次评估\n",
    "                             #steps，需要搭配eval_steps=xxx使用:没xxx步评估一次\n",
    "                             evaluation_strategy=\"epoch\",\n",
    "                             #保存策略\n",
    "                             save_strategy=\"epoch\",#每一轮保存一下模型\n",
    "                             save_total_limit=3,#限制最多保存5个模型\n",
    "                             learning_rate=2e-5,\n",
    "                             weight_decay=0.01,#梯度衰减\n",
    "                             metric_for_best_model=\"f1\",#设置评估指标\n",
    "                             load_best_model_at_end=True#最后默认加载最好的 模型 \n",
    "                            #  optim= 设置优化器\n",
    "                             )\n",
    "train_args#查看参数\n",
    "from transformers import DataCollatorWithPadding\n",
    "\"\"\"传入模型，训练参数，数据集\"\"\"\n",
    "# ！！！冻结操作\n",
    "for name,param in model.bert.named_parameters():\n",
    "    param.requires_grad=False\n",
    "    \n",
    "\n",
    "trainer=Trainer(model=model,args=train_args,train_dataset=tokenized_datasets[\"train\"],\n",
    "                eval_dataset=tokenized_datasets[\"test\"],\n",
    "                data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
    "                compute_metrics=eval_metric \n",
    "                )\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b6a3af4d",
   "metadata": {},
   "source": [
    "## Data Length "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "46f4a8da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e7c23bce82ba4dd7a3f0f53e15371058",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/6988 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "623bdfb0be114287ba0f1a4364587aab",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/777 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at E:\\HuggFace_model\\chinese-macbert-large and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b9dbeba181c44d49918aee90a226ac7e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/218 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'loss': 0.6305, 'learning_rate': 1.9082568807339454e-05, 'epoch': 0.05}\n",
      "{'loss': 0.4352, 'learning_rate': 1.81651376146789e-05, 'epoch': 0.09}\n",
      "{'loss': 0.3756, 'learning_rate': 1.724770642201835e-05, 'epoch': 0.14}\n",
      "{'loss': 0.4125, 'learning_rate': 1.63302752293578e-05, 'epoch': 0.18}\n",
      "{'loss': 0.3112, 'learning_rate': 1.541284403669725e-05, 'epoch': 0.23}\n",
      "{'loss': 0.4704, 'learning_rate': 1.4495412844036698e-05, 'epoch': 0.27}\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[2], line 59\u001b[0m\n\u001b[0;32m     53\u001b[0m \u001b[38;5;124;03m\"\"\"传入模型，训练参数，数据集\"\"\"\u001b[39;00m\n\u001b[0;32m     54\u001b[0m trainer\u001b[38;5;241m=\u001b[39mTrainer(model\u001b[38;5;241m=\u001b[39mmodel,args\u001b[38;5;241m=\u001b[39mtrain_args,train_dataset\u001b[38;5;241m=\u001b[39mtokenized_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrain\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m     55\u001b[0m                 eval_dataset\u001b[38;5;241m=\u001b[39mtokenized_datasets[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m     56\u001b[0m                 data_collator\u001b[38;5;241m=\u001b[39mDataCollatorWithPadding(tokenizer\u001b[38;5;241m=\u001b[39mtokenizer),\n\u001b[0;32m     57\u001b[0m                 compute_metrics\u001b[38;5;241m=\u001b[39meval_metric \n\u001b[0;32m     58\u001b[0m                 )\n\u001b[1;32m---> 59\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1633\u001b[0m, in \u001b[0;36mTrainer.train\u001b[1;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[0;32m   1628\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[0;32m   1630\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[0;32m   1631\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[0;32m   1632\u001b[0m )\n\u001b[1;32m-> 1633\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1634\u001b[0m \u001b[43m    \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1635\u001b[0m \u001b[43m    \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1636\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1637\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1638\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:1902\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[1;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[0;32m   1900\u001b[0m         tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining_step(model, inputs)\n\u001b[0;32m   1901\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1902\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1904\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m   1905\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[0;32m   1906\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[0;32m   1907\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[0;32m   1908\u001b[0m ):\n\u001b[0;32m   1909\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[0;32m   1910\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:2645\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[1;34m(self, model, inputs)\u001b[0m\n\u001b[0;32m   2642\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[0;32m   2644\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[1;32m-> 2645\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2647\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m   2648\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean()  \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[1;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[0;32m   2675\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m   2676\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 2677\u001b[0m outputs \u001b[38;5;241m=\u001b[39m model(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs)\n\u001b[0;32m   2678\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[0;32m   2679\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[0;32m   2680\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:1562\u001b[0m, in \u001b[0;36mBertForSequenceClassification.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m   1554\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   1555\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):\u001b[39;00m\n\u001b[0;32m   1556\u001b[0m \u001b[38;5;124;03m    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,\u001b[39;00m\n\u001b[0;32m   1557\u001b[0m \u001b[38;5;124;03m    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If\u001b[39;00m\n\u001b[0;32m   1558\u001b[0m \u001b[38;5;124;03m    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).\u001b[39;00m\n\u001b[0;32m   1559\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m   1560\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m-> 1562\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbert\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1563\u001b[0m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1564\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1565\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1566\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1567\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1568\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1569\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1570\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1571\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1572\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1574\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m   1576\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(pooled_output)\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:1020\u001b[0m, in \u001b[0;36mBertModel.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m   1011\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[0;32m   1013\u001b[0m embedding_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membeddings(\n\u001b[0;32m   1014\u001b[0m     input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[0;32m   1015\u001b[0m     position_ids\u001b[38;5;241m=\u001b[39mposition_ids,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1018\u001b[0m     past_key_values_length\u001b[38;5;241m=\u001b[39mpast_key_values_length,\n\u001b[0;32m   1019\u001b[0m )\n\u001b[1;32m-> 1020\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1021\u001b[0m \u001b[43m    \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1022\u001b[0m \u001b[43m    \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1023\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1024\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1025\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_extended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1026\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1027\u001b[0m \u001b[43m    \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1028\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1029\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1030\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1031\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1032\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m   1033\u001b[0m pooled_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler(sequence_output) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpooler \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:601\u001b[0m, in \u001b[0;36mBertEncoder.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m    597\u001b[0m             \u001b[38;5;28;01mreturn\u001b[39;00m module(\u001b[38;5;241m*\u001b[39minputs, past_key_value, output_attentions)\n\u001b[0;32m    599\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m custom_forward\n\u001b[1;32m--> 601\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mutils\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheckpoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheckpoint\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    602\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcreate_custom_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlayer_module\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    603\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    604\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    605\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    606\u001b[0m \u001b[43m        \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    607\u001b[0m \u001b[43m        \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    608\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    609\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    610\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m layer_module(\n\u001b[0;32m    611\u001b[0m         hidden_states,\n\u001b[0;32m    612\u001b[0m         attention_mask,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    617\u001b[0m         output_attentions,\n\u001b[0;32m    618\u001b[0m     )\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\utils\\checkpoint.py:249\u001b[0m, in \u001b[0;36mcheckpoint\u001b[1;34m(function, use_reentrant, *args, **kwargs)\u001b[0m\n\u001b[0;32m    246\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnexpected keyword arguments: \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m,\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(arg \u001b[38;5;28;01mfor\u001b[39;00m arg \u001b[38;5;129;01min\u001b[39;00m kwargs))\n\u001b[0;32m    248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_reentrant:\n\u001b[1;32m--> 249\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mCheckpointFunction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpreserve\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    250\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    251\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m _checkpoint_without_reentrant(\n\u001b[0;32m    252\u001b[0m         function,\n\u001b[0;32m    253\u001b[0m         preserve,\n\u001b[0;32m    254\u001b[0m         \u001b[38;5;241m*\u001b[39margs,\n\u001b[0;32m    255\u001b[0m         \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m    256\u001b[0m     )\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\utils\\checkpoint.py:107\u001b[0m, in \u001b[0;36mCheckpointFunction.forward\u001b[1;34m(ctx, run_function, preserve_rng_state, *args)\u001b[0m\n\u001b[0;32m    104\u001b[0m ctx\u001b[38;5;241m.\u001b[39msave_for_backward(\u001b[38;5;241m*\u001b[39mtensor_inputs)\n\u001b[0;32m    106\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[1;32m--> 107\u001b[0m     outputs \u001b[38;5;241m=\u001b[39m \u001b[43mrun_function\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    108\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:597\u001b[0m, in \u001b[0;36mBertEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forward\u001b[1;34m(*inputs)\u001b[0m\n\u001b[0;32m    596\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mcustom_forward\u001b[39m(\u001b[38;5;241m*\u001b[39minputs):\n\u001b[1;32m--> 597\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodule\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpast_key_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:537\u001b[0m, in \u001b[0;36mBertLayer.forward\u001b[1;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[0;32m    534\u001b[0m     cross_attn_present_key_value \u001b[38;5;241m=\u001b[39m cross_attention_outputs[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m    535\u001b[0m     present_key_value \u001b[38;5;241m=\u001b[39m present_key_value \u001b[38;5;241m+\u001b[39m cross_attn_present_key_value\n\u001b[1;32m--> 537\u001b[0m layer_output \u001b[38;5;241m=\u001b[39m \u001b[43mapply_chunking_to_forward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    538\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeed_forward_chunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchunk_size_feed_forward\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mseq_len_dim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mattention_output\u001b[49m\n\u001b[0;32m    539\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    540\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (layer_output,) \u001b[38;5;241m+\u001b[39m outputs\n\u001b[0;32m    542\u001b[0m \u001b[38;5;66;03m# if decoder, return the attn key/values as the last output\u001b[39;00m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\pytorch_utils.py:248\u001b[0m, in \u001b[0;36mapply_chunking_to_forward\u001b[1;34m(forward_fn, chunk_size, chunk_dim, *input_tensors)\u001b[0m\n\u001b[0;32m    245\u001b[0m     \u001b[38;5;66;03m# concatenate output at same dimension\u001b[39;00m\n\u001b[0;32m    246\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mcat(output_chunks, dim\u001b[38;5;241m=\u001b[39mchunk_dim)\n\u001b[1;32m--> 248\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minput_tensors\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:550\u001b[0m, in \u001b[0;36mBertLayer.feed_forward_chunk\u001b[1;34m(self, attention_output)\u001b[0m\n\u001b[0;32m    548\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfeed_forward_chunk\u001b[39m(\u001b[38;5;28mself\u001b[39m, attention_output):\n\u001b[0;32m    549\u001b[0m     intermediate_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mintermediate(attention_output)\n\u001b[1;32m--> 550\u001b[0m     layer_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moutput\u001b[49m\u001b[43m(\u001b[49m\u001b[43mintermediate_output\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mattention_output\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m layer_output\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\transformers\\models\\bert\\modeling_bert.py:462\u001b[0m, in \u001b[0;36mBertOutput.forward\u001b[1;34m(self, hidden_states, input_tensor)\u001b[0m\n\u001b[0;32m    461\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, hidden_states: torch\u001b[38;5;241m.\u001b[39mTensor, input_tensor: torch\u001b[38;5;241m.\u001b[39mTensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m torch\u001b[38;5;241m.\u001b[39mTensor:\n\u001b[1;32m--> 462\u001b[0m     hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdense\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    463\u001b[0m     hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(hidden_states)\n\u001b[0;32m    464\u001b[0m     hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mLayerNorm(hidden_states \u001b[38;5;241m+\u001b[39m input_tensor)\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\module.py:1194\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m   1190\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1191\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1192\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1193\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1194\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28minput\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1195\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1196\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[1;32mz:\\ANACONDA\\envs\\transformers\\lib\\site-packages\\torch\\nn\\modules\\linear.py:114\u001b[0m, in \u001b[0;36mLinear.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m    113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[1;32m--> 114\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlinear\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# 第一步到第五步\n",
    "from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments\n",
    "from datasets import load_dataset\n",
    "dataset=load_dataset(\"csv\",data_files=\"../tst_fold\\ChnSentiCorp_htl_all copy.csv\",split=\"train\")\n",
    "dataset=dataset.filter(lambda x: x[\"review\"] is not None)\n",
    "datasets=dataset.train_test_split(test_size=0.1)\n",
    "import torch\n",
    "tokenizer=AutoTokenizer.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")\n",
    "def process_func(examples):\n",
    "# ！！！更改最大长度：128->32\n",
    "    tokenized_examples=tokenizer(examples[\"review\"],max_length=32,truncation=True,padding=\"max_length\")\n",
    "    tokenized_examples[\"labels\"]=examples[\"label\"]\n",
    "    return tokenized_examples\n",
    "tokenized_datasets=datasets.map(process_func,batched=True,remove_columns=datasets[\"train\"].column_names)\n",
    "import evaluate\n",
    "\"\"\"评估函数\"\"\"\n",
    "acc_metrics=evaluate.load(r\"../evaluate_main\\metrics\\accuracy\")\n",
    "f1_metrics=evaluate.load(r\"../evaluate_main\\metrics\\f1\")\n",
    "def eval_metric(eval_predict):\n",
    "    predictions,labels=eval_predict\n",
    "    predictions=predictions.argmax(axis=-1)\n",
    "    acc=acc_metrics.compute(predictions=predictions,references=labels)\n",
    "    f1=f1_metrics.compute(predictions=predictions,references=labels)\n",
    "    acc.update(f1)\n",
    "    return acc\n",
    "\n",
    "\"\"\"创建模型\"\"\"\n",
    "model=AutoModelForSequenceClassification.from_pretrained(r\"E:\\HuggFace_model\\chinese-macbert-large\")#模型\n",
    "\n",
    "\n",
    "train_args=TrainingArguments(output_dir=\"./checkpoints_Trainer\",\n",
    "                             per_device_train_batch_size=1,#！！！！训练集 dataloader每个批次大小\n",
    "                             gradient_accumulation_steps=32,#！！！！设置梯度累加\n",
    "                             per_device_eval_batch_size=1,#！！！测试集 dataloader 每个批次大小\n",
    "                             gradient_checkpointing=True, #! ! !通过 以时间换空间 的策略，显著减少了激活值的显存占用\n",
    "                             optim=\"adafactor\", #！！！切换adafactor\n",
    "                             num_train_epochs=1, #总轮数\n",
    "                             logging_steps=10, # 每十步进行一次打印\n",
    "                             #epoch:每一轮做一次评估\n",
    "                             #steps，需要搭配eval_steps=xxx使用:没xxx步评估一次\n",
    "                             evaluation_strategy=\"epoch\",\n",
    "                             #保存策略\n",
    "                             save_strategy=\"epoch\",#每一轮保存一下模型\n",
    "                             save_total_limit=3,#限制最多保存5个模型\n",
    "                             learning_rate=2e-5,\n",
    "                             weight_decay=0.01,#梯度衰减\n",
    "                             metric_for_best_model=\"f1\",#设置评估指标\n",
    "                             load_best_model_at_end=True#最后默认加载最好的 模型 \n",
    "                            #  optim= 设置优化器\n",
    "                             )\n",
    "train_args#查看参数\n",
    "from transformers import DataCollatorWithPadding\n",
    "\"\"\"传入模型，训练参数，数据集\"\"\"\n",
    "trainer=Trainer(model=model,args=train_args,train_dataset=tokenized_datasets[\"train\"],\n",
    "                eval_dataset=tokenized_datasets[\"test\"],\n",
    "                data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
    "                compute_metrics=eval_metric \n",
    "                )\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f6f66c4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "transformers",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
